Flight EDA

Author

Julie Neisler

Getting Set Up

Packages

# Accessing and Reading Data
library(googleCloudStorageR) # let's me pull in the data from GCP
library(readxl) # read data

# Data Manipulation
library(tidyverse) # Basic data manipulation function
library(janitor) # Advanced data manipulation function

# Dealing with Dates
library(lubridate) 
library(hms) 

# Graphing and Analysis
library(corrplot) # graph correlation plots
library(leaps) # fancy regressions
library(broom) # fancy regressions

# Quarto
library(kableExtra)

Data

glimpse(flight_data)
Rows: 10,683
Columns: 11
$ Airline         <chr> "IndiGo", "Air India", "Jet Airways", "IndiGo", "IndiG…
$ Date_of_Journey <chr> "24/03/2019", "1/05/2019", "9/06/2019", "12/05/2019", …
$ Source          <chr> "Banglore", "Kolkata", "Delhi", "Kolkata", "Banglore",…
$ Destination     <chr> "New Delhi", "Banglore", "Cochin", "Banglore", "New De…
$ Route           <chr> "BLR → DEL", "CCU → IXR → BBI → BLR", "DEL → LKO → BOM…
$ Dep_Time        <chr> "22:20", "05:50", "09:25", "18:05", "16:50", "09:00", …
$ Arrival_Time    <chr> "01:10 22 Mar", "13:15", "04:25 10 Jun", "23:30", "21:…
$ Duration        <chr> "2h 50m", "7h 25m", "19h", "5h 25m", "4h 45m", "2h 25m…
$ Total_Stops     <chr> "non-stop", "2 stops", "2 stops", "1 stop", "1 stop", …
$ Additional_Info <chr> "No info", "No info", "No info", "No info", "No info",…
$ Price           <dbl> 3897, 7662, 13882, 6218, 13302, 3873, 11087, 22270, 11…

Data Cleaning

Removing Missing Data and Data with No Variance

Amount of Missingness by Variable

flight_data %>% 
  summarise_all(~ sum(is.na(.))) %>%
  kbl(caption = "Missing Values by Variable") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                full_width = FALSE, position = "left")
Missing Values by Variable
Airline Date_of_Journey Source Destination Route Dep_Time Arrival_Time Duration Total_Stops Additional_Info Price
0 0 0 0 1 0 0 0 1 0 0


A single flight had 4 stops, but it’s somehow gone. Rolling with it

flight_data %>% 
  filter(Total_Stops == "4 stops") %>%
  kbl(caption = "Flight with 4 stops") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                full_width = FALSE, position = "left") %>%
  scroll_box(width = "100%")
Flight with 4 stops
Airline Date_of_Journey Source Destination Route Dep_Time Arrival_Time Duration Total_Stops Additional_Info Price
Air India 01/03/2019 Banglore New Delhi BLR → CCU → BBI → HYD → VGA → DEL 05:50 11:20 02 Mar 29h 30m 4 stops Change airports 17686


Check for missing Total_Stops

flight_data %>% 
  filter(is.na(Total_Stops)) %>%
  kbl(caption = "Flights with missing Total_Stops") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                full_width = FALSE, position = "left")
Flights with missing Total_Stops
Airline Date_of_Journey Source Destination Route Dep_Time Arrival_Time Duration Total_Stops Additional_Info Price
Air India 6/05/2019 Delhi Cochin NA 09:45 09:25 07 May 23h 40m NA No info 7480


The row with 4 stops is an outlier, and the row with missing data can’t be extrapolated. Removing both.

flight_data <- flight_data %>% 
  filter(Total_Stops != "4 stops") %>%
  na.omit()

Updating Data Formats

flight_data$Airline <- as.factor(flight_data$Airline)
flight_data$Total_Stops <- factor(flight_data$Total_Stops, levels = c("non-stop", "1 stop", "2 stops", "3 stops", "4 stops"))
flight_data$Additional_Info <- as.factor(flight_data$Additional_Info)

Dates

Arrival time is date and time but departure time is just the time, so let’s get that fixed.

flight_data1 <- flight_data %>%
  mutate(departure_date_time = dmy_hms(paste0(Date_of_Journey, " ", Dep_Time, ":00"))) 

There is only a date for arrival if it was different than the departure date. Let’s get that fixed so that both departure and arrival have a full date and time set up.

flight_data2 <- flight_data1 %>%
  mutate(Arrival_Time_Only = str_extract(Arrival_Time, "^\\d{2}:\\d{2}"), # Pull out just time
         Dep_Time = paste0(Dep_Time, ":00"),
         Arrival_Time_Only = paste0(Arrival_Time_Only, ":00"),
         Arrival_Date = str_extract(Arrival_Time, "\\d{1,2} \\w{3}$") # Pull out just date
  ) %>%
  separate(Arrival_Date, into = c("Arrival_Day", "Arrival_Month"), sep = " ", remove = FALSE) %>%
  mutate(Arrival_Month_Num = case_when(Arrival_Month == "Jan" ~ 01, # replacing the words with numbers
                                       Arrival_Month == "Feb" ~ 02,
                                       Arrival_Month == "Mar" ~ 03,
                                       Arrival_Month == "Apr" ~ 04,
                                       Arrival_Month == "May" ~ 05,
                                       Arrival_Month == "Jun" ~ 06,
                                       Arrival_Month == "Jul" ~ 07,
                                       Arrival_Month == "Aug" ~ 08,
                                       Arrival_Month == "Sep" ~ 09,
                                       Arrival_Month == "Oct" ~ 10,
                                       Arrival_Month == "Nov" ~ 11,
                                       Arrival_Month == "Dec" ~ 12)) %>%
  separate(Date_of_Journey, # The dates don't have years, so pulling out the departure years as reference
           into = c("Departure_Day", "Departure_Month", "Departure_Year"), 
           sep = "/", 
           remove = FALSE) %>%
  mutate(Arrival_Day = ifelse(!is.na(Arrival_Day), Arrival_Day, as.numeric(Departure_Day)), 
         Arrival_Month_Num = ifelse(!is.na(Arrival_Month_Num), Arrival_Month_Num, as.numeric(Departure_Month)),
         Arrival_Date_combined = paste0(Departure_Year, "-", Arrival_Month_Num, "-", Arrival_Day)) %>%
  mutate(arrival_date_time = ymd_hms(paste0(Arrival_Date_combined, " ", Arrival_Time_Only))) %>%
  select(-Arrival_Month, -Arrival_Time) %>%
  rename(Departure_Time = Dep_Time,
         Arrival_Month = Arrival_Month_Num, 
         Arrival_Time = Arrival_Time_Only) %>%
  select(Airline, Date_of_Journey, 
         departure_date_time, Departure_Day, Departure_Month, Departure_Year, Departure_Time,
         arrival_date_time, Arrival_Day, Arrival_Month, Arrival_Time,
         Duration, 
         Source, Destination, Route, Total_Stops, Price, Additional_Info)

flight_data3 <- flight_data2

Making sure all the dates are formatted correctly

flight_data3$Departure_Day <- as.numeric(flight_data3$Departure_Day)
flight_data3$Departure_Month <- as.numeric(flight_data3$Departure_Month)
flight_data3$Departure_Year <- as.numeric(flight_data3$Departure_Year)
flight_data3$Arrival_Day <- as.numeric(flight_data3$Arrival_Day)

flight_data3$Departure_Time <- as_hms(flight_data3$Departure_Time)
flight_data3$Arrival_Time <- as_hms(flight_data3$Arrival_Time)

flight_data4 <- flight_data3

Let’s add features for day of week and hour of departure

flight_data4$departure_day_of_week <- weekdays(flight_data4$departure_date_time)
flight_data4$Departure_Hour = hour(flight_data4$Departure_Time)

I don’t need to care about time zones since all the cities are in the same time zone.

unique(flight_data4$Source)
[1] "Banglore" "Kolkata"  "Delhi"    "Chennai"  "Mumbai"  
unique(flight_data4$Destination)
[1] "New Delhi" "Banglore"  "Cochin"    "Kolkata"   "Delhi"     "Hyderabad"

Duration

First calculating duration

flight_data4 <- flight_data4 %>%
  mutate(
    Duration_Hours = replace_na(as.numeric(str_extract(Duration, "\\d+(?=h)")), 0) + 
      replace_na(as.numeric(str_extract(Duration, "\\d+(?=m)")), 0) / 60
  )

Then examining if calculated duration is the same as provided duration. There are a handful of flights that appear to be different. For some, it seems the arrival is before the departure.

duration_comparison <- flight_data4 %>%
  rowwise() %>%
  mutate(calculated_duration = arrival_date_time - departure_date_time,
         calculated_duration_hours = as.numeric(calculated_duration, units = "hours")
  ) %>%
  mutate(difference_between_calculation_file = Duration_Hours - calculated_duration_hours,
         abs_difference_between_calculation_file = abs(difference_between_calculation_file)) %>%
  filter(difference_between_calculation_file > 1) %>%
  select(departure_date_time, arrival_date_time, calculated_duration_hours, Duration_Hours, difference_between_calculation_file) %>%
  head(10)

duration_comparison %>%
  kbl(caption = "Flights with Duration Discrepancies", 
      col.names = c("Departure", "Arrival", "Calculated Hours", "File Hours", "Difference")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                full_width = FALSE, position = "left")
Flights with Duration Discrepancies
Departure Arrival Calculated Hours File Hours Difference
2019-03-24 22:20:00 2019-03-22 01:10:00 -69.16667 2.833333 72
2019-03-21 22:00:00 2019-03-19 13:20:00 -56.66667 15.333333 72
2019-03-18 14:05:00 2019-03-16 05:05:00 -57.00000 15.000000 72
2019-03-18 16:55:00 2019-03-16 09:00:00 -55.91667 16.083333 72
2019-03-21 22:00:00 2019-03-19 10:50:00 -59.16667 12.833333 72
2019-03-15 22:55:00 2019-03-13 05:05:00 -65.83333 6.166667 72
2019-03-21 11:50:00 2019-03-19 08:55:00 -50.91667 21.083333 72
2019-03-18 07:00:00 2019-03-16 07:40:00 -47.33333 24.666667 72
2019-03-18 11:05:00 2019-03-16 22:10:00 -36.91667 35.083333 72
2019-03-21 19:35:00 2019-03-19 00:35:00 -67.00000 5.000000 72


No time-based patterns to wonky durations.

flight_data4 %>%
  rowwise() %>%
  mutate(calculated_duration = arrival_date_time - departure_date_time,
         calculated_duration_hours = as.numeric(calculated_duration, units = "hours")
  ) %>%
  mutate(difference_between_calculation_file = Duration_Hours - calculated_duration_hours,
         abs_difference_between_calculation_file = abs(difference_between_calculation_file)) %>%
  filter(difference_between_calculation_file > 1) %>%
  ggplot(aes(x = departure_date_time)) +
  geom_histogram(bins = 30, fill = "steelblue", alpha = 0.7) +
  labs(title = "Distribution of Flights with Duration Discrepancies",
       x = "Departure Date/Time",
       y = "Count") +
  theme_minimal()

Remove data where the arrival was before the departure.

flight_data4 <- flight_data4 %>%
  rowwise() %>%
  mutate(wonky_times = case_when(arrival_date_time > departure_date_time ~ "keep",
                                 TRUE ~ "remove"
  )) %>%
  filter(wonky_times == "keep") %>%
  select(-wonky_times) %>%
  ungroup() 

Route Analysis

Let’s look at route structure

route_sample <- unique(flight_data4$Route)[1:10]
route_sample %>%
  kbl(caption = "Sample Routes") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                full_width = FALSE, position = "left")
Sample Routes
x
CCU → IXR → BBI → BLR
DEL → LKO → BOM → COK
CCU → NAG → BLR
BLR → NAG → DEL
CCU → BLR
BLR → BOM → DEL
DEL → BOM → COK
DEL → BLR → COK
MAA → CCU
CCU → BOM → BLR

Parsing route into individual airport codes

flight_data5 <- flight_data4 %>%
  separate_wider_delim(
    cols = Route, 
    delim = " → ", 
    names = c("airport_code_departure", "airport_code_first_stop", 
              "airport_code_second_stop", "airport_code_third_stop", "airport_code_arrival"),
    too_few = "align_start",  # Fill missing values with NA on the right
    too_many = "drop",        # Drop extra stops beyond 5 airports
    cols_remove = FALSE
  ) %>%
  # Count the actual number of airports in each route
  mutate(num_airports = str_count(Route, " → ") + 1) %>%
  # Fix the column assignments based on actual number of airports
  mutate(
    # Fix arrival column - always the last airport
    airport_code_arrival = case_when(
      num_airports == 2 ~ airport_code_first_stop,   # 2 airports: departure → arrival
      num_airports == 3 ~ airport_code_second_stop,  # 3 airports: departure → stop → arrival  
      num_airports == 4 ~ airport_code_third_stop,   # 4 airports: departure → stop → stop → arrival
      TRUE ~ airport_code_arrival                    # 5 airports: departure → stop → stop → stop → arrival
    ),
    
    # Fix third stop column
    airport_code_third_stop = case_when(
      num_airports <= 4 ~ NA_character_,             # No third stop for routes with 4 or fewer airports
      TRUE ~ airport_code_third_stop                 # Keep third stop only for 5-airport routes
    ),
    
    # Fix second stop column  
    airport_code_second_stop = case_when(
      num_airports <= 3 ~ NA_character_,             # No second stop for routes with 3 or fewer airports
      TRUE ~ airport_code_second_stop                # Keep second stop for 4+ airport routes
    ),
    
    # Fix first stop column
    airport_code_first_stop = case_when(
      num_airports <= 2 ~ NA_character_,             # No first stop for non-stop flights
      TRUE ~ airport_code_first_stop                 # Keep first stop for 3+ airport routes
    )
  ) %>%
  select(-num_airports)


Verify the route parsing results

route_verification <- flight_data5 %>%
  select(Route, airport_code_departure, airport_code_first_stop, 
         airport_code_second_stop, airport_code_third_stop, airport_code_arrival) %>%
  # Show examples of different route types
  slice(c(1:5, 
          which(str_count(flight_data5$Route, " → ") == 1)[1:2],  # 2 airports
          which(str_count(flight_data5$Route, " → ") == 2)[1:2],  # 3 airports  
          which(str_count(flight_data5$Route, " → ") == 3)[1:2],  # 4 airports
          which(str_count(flight_data5$Route, " → ") == 4)[1:2]   # 5 airports (if any)
  )) %>%
  arrange(str_count(Route, " → "))

route_verification %>%
  kbl(caption = "Route Parsing Verification") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                full_width = FALSE, position = "left") %>%
  scroll_box(width = "100%", height = "300px")
Route Parsing Verification
Route airport_code_departure airport_code_first_stop airport_code_second_stop airport_code_third_stop airport_code_arrival
CCU → BLR CCU NA NA NA BLR
CCU → BLR CCU NA NA NA BLR
CCU → BLR CCU NA NA NA BLR
CCU → NAG → BLR CCU NAG NA NA BLR
BLR → NAG → DEL BLR NAG NA NA DEL
CCU → NAG → BLR CCU NAG NA NA BLR
BLR → NAG → DEL BLR NAG NA NA DEL
CCU → IXR → BBI → BLR CCU IXR BBI NA BLR
DEL → LKO → BOM → COK DEL LKO BOM NA COK
CCU → IXR → BBI → BLR CCU IXR BBI NA BLR
DEL → LKO → BOM → COK DEL LKO BOM NA COK
DEL → RPR → NAG → BOM → COK DEL RPR NAG BOM COK
CCU → BBI → IXR → DEL → BLR CCU BBI IXR DEL BLR


Checking unique airports by category

cat("Source cities:", paste(unique(flight_data5$Source), collapse = ", "), "\n")
Source cities: Kolkata, Delhi, Banglore, Chennai, Mumbai 
cat("Destination cities:", paste(unique(flight_data5$Destination), collapse = ", "), "\n")
Destination cities: Banglore, Cochin, New Delhi, Kolkata, Delhi, Hyderabad 
cat("First stop airports:", paste(unique(flight_data5$airport_code_first_stop), collapse = ", "), "\n")
First stop airports: IXR, LKO, NAG, NA, BOM, BLR, AMD, PNQ, CCU, IDR, GAU, MAA, HYD, DEL, BHO, JAI, ATQ, JDH, BBI, GOI, BDQ, TRV, IXU, IXB, UDR, RPR, DED, VGA, COK, VNS, IXC, PAT, JLR, KNU, GWL, VTZ, NDC, IXZ, HBX, IXA, STV 

Checking if City and airport code match up perfectly

# Check departure alignment
departure_check <- flight_data5 %>%
  group_by(Source, airport_code_departure) %>%
  summarise(count = n(), .groups = "drop")

departure_check %>%
  kbl(caption = "Source City vs Departure Airport Alignment") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                full_width = FALSE, position = "left")
Source City vs Departure Airport Alignment
Source airport_code_departure count
Banglore BLR 2109
Chennai MAA 380
Delhi DEL 4536
Kolkata CCU 2871
Mumbai BOM 693


But I’m also realizing that the Destination has New Delhi instead of Delhi, so need to change that first.

flight_data5 <- flight_data5 %>%
  mutate(Destination = ifelse(Destination == "New Delhi", "Delhi", Destination)) 

# Check arrival alignment
arrival_check <- flight_data5 %>%
  group_by(Destination, airport_code_arrival) %>%
  summarise(count = n(), .groups = "drop")

arrival_check %>%
  kbl(caption = "Destination City vs Arrival Airport Alignment") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                full_width = FALSE, position = "left")
Destination City vs Arrival Airport Alignment
Destination airport_code_arrival count
Banglore BLR 2871
Cochin COK 4536
Delhi DEL 2109
Hyderabad HYD 693
Kolkata CCU 380
flight_data6 <- flight_data5

Exploratory Data Analysis

Routes

Visualizing flight routes by number of stops

flight_data6 %>%
  mutate(row = row_number()) %>%
  select(row, Total_Stops, airport_code_departure, airport_code_first_stop, airport_code_second_stop, airport_code_third_stop, airport_code_arrival) %>%
  pivot_longer(!row:Total_Stops, names_to = "stop_on_trip", values_to = "airport_code") %>%
  group_by(Total_Stops, stop_on_trip, airport_code) %>%
  mutate(count = n()) %>%
  mutate(stop_on_trip = factor(stop_on_trip, levels = c("airport_code_departure", 
                                                        "airport_code_first_stop", 
                                                        "airport_code_second_stop", 
                                                        "airport_code_third_stop",
                                                        "airport_code_arrival"))) %>%
  filter(!is.na(airport_code)) %>%
  ggplot(aes(x = stop_on_trip,  y = reorder(airport_code,count), group = row)) +
  geom_point(alpha = 0.6) +
  geom_line(aes(color = count), alpha = 0.7) +
  labs(title = "Flight Routes by Number of Stops",
       x = "Stop on Trip", 
       y = "Airport Code", 
       color = "Count of Flights\non that Path") +
  facet_wrap(.~Total_Stops, scales = "free") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, vjust = .8, hjust = .8))

Price Outliers

Number of Price Outliers

nrow(rstatix::identify_outliers(data = flight_data6, variable = "Price"))
[1] 90

Visualizations of outliers

flight_data6 %>%
  ggplot(aes(x = Price)) +
  geom_boxplot(fill = "lightblue", alpha = 0.7) +
  labs(title = "Distribution of Flight Prices",
       x = "Price") +
  theme_minimal() +
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank())

An outlier is a value 1.5 times that of the IQR. Below, we see outliers are prices above $23,170.

outlier_summary <- rstatix::identify_outliers(data = flight_data6, variable = "Price")

# Create a more readable summary table
outlier_summary %>%
  select(Airline, Date_of_Journey, departure_date_time, Price, is.outlier, is.extreme) %>%
  head(10) %>%
  kbl(caption = "Sample of Price Outliers",
      col.names = c("Airline", "Journey Date", "Departure Time", "Price", "Outlier", "Extreme")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                full_width = FALSE, position = "left")
Sample of Price Outliers
Airline Journey Date Departure Time Price Outlier Extreme
Air India 1/03/2019 2019-03-01 23:00:00 27430 TRUE FALSE
Multiple carriers 1/03/2019 2019-03-01 12:50:00 36983 TRUE TRUE
Jet Airways 01/03/2019 2019-03-01 08:55:00 26890 TRUE FALSE
Jet Airways 01/03/2019 2019-03-01 07:00:00 26890 TRUE FALSE
Air India 6/03/2019 2019-03-06 09:40:00 25139 TRUE FALSE
Jet Airways Business 01/03/2019 2019-03-01 05:45:00 52229 TRUE TRUE
Air India 01/03/2019 2019-03-01 08:50:00 26743 TRUE FALSE
Jet Airways 01/03/2019 2019-03-01 05:45:00 26890 TRUE FALSE
Jet Airways 01/03/2019 2019-03-01 22:50:00 25735 TRUE FALSE
Jet Airways 01/03/2019 2019-03-01 05:45:00 27992 TRUE FALSE

Removing the outliers.

flight_data7 <- flight_data6 %>%
  filter(Price < 23170)

Correlation Matrix

Including the variables in my correlation Matrix

corr.data <- flight_data7 %>%
  mutate(weekend = case_when(departure_day_of_week == "Sunday" | departure_day_of_week == "Saturday" ~ "Weekend",
                             TRUE ~ "Weekday"),
         number_of_stops = case_when(Total_Stops == "non-stop" ~ 0,
                                     Total_Stops == "1 stop" ~ 1,
                                     Total_Stops == "2 stops" ~ 2,
                                     Total_Stops == "3 stops" ~ 3,
                                     Total_Stops == "4 stops" ~ 4),
         premium_economy = str_detect(Airline, "Premium")) %>%
  select(Airline, premium_economy, departure_date_time, Departure_Day, Departure_Month, Departure_Time,
         departure_day_of_week, arrival_date_time, Arrival_Day, Arrival_Month, Arrival_Time,
         weekend, Route, number_of_stops, airport_code_departure, airport_code_first_stop, 
         airport_code_second_stop, airport_code_third_stop, airport_code_arrival,
         Duration_Hours, Price) %>%
  data.matrix()

corr_matrix <- round(cor(corr.data, use="pairwise.complete.obs", method="pearson"), 2)

corrplot(corr_matrix, method="circle", type="lower", tl.col = "black", tl.cex = .65,
         tl.srt = 45, title = "Correlation Matrix of Flight Variables")

Let’s look at just the Price relationships.

corr_matrix_df <- rstatix::cor_gather(corr_matrix)

price_correlations <- corr_matrix_df %>%
  filter(var1 == 'Price') %>%
  arrange(desc(abs(cor))) %>%
  mutate(cor = round(cor, 3))

price_correlations %>%
  kbl(caption = "Price Correlations with Other Variables",
      col.names = c("Variable 1", "Variable 2", "Correlation")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                full_width = FALSE, position = "left") %>%
  scroll_box(width = "100%", height = "400px")
Price Correlations with Other Variables
Variable 1 Variable 2 Correlation
Price Price 1.00
Price number_of_stops 0.67
Price Duration_Hours 0.58
Price airport_code_departure 0.27
Price Route 0.26
Price airport_code_second_stop -0.22
Price airport_code_arrival -0.17
Price airport_code_first_stop -0.14
Price Departure_Day -0.12
Price Arrival_Day -0.10
Price departure_date_time -0.08
Price departure_day_of_week 0.08
Price arrival_date_time -0.07
Price Departure_Month -0.05
Price Arrival_Month -0.05
Price weekend 0.05
Price Airline -0.04
Price Arrival_Time 0.04
Price premium_economy 0.02
Price airport_code_third_stop 0.01
Price Departure_Time 0.00

Number of stops and duration are easy enough to figure out. Route and second airport code and destination are all pretty high up there. There’s something about these variables that has some influence but I’m not sure what.

Graphing Strongest Relationships

Price by total stops

flight_data7 %>%
  filter(!is.na(Total_Stops)) %>%
  mutate(number_of_stops = case_when(Total_Stops == "non-stop" ~ 0,
                                     Total_Stops == "1 stop" ~ 1,
                                     Total_Stops == "2 stops" ~ 2,
                                     Total_Stops == "3 stops" ~ 3,
                                     Total_Stops == "4 stops" ~ 4)) %>%
  ggplot(aes(x = number_of_stops, y = Price)) +
  geom_point(position = 'jitter', alpha = 0.6) +
  geom_smooth(method = 'lm', color = "red", se = TRUE) +
  labs(title = "Flight Price by Number of Stops",
       x = "Number of Stops",
       y = "Price ($)") +
  theme_minimal() +
  scale_x_continuous(breaks = 0:3, labels = c("Non-stop", "1 stop", "2 stops", "3 stops"))

Duration of flight

flight_data7 %>%
  filter(!is.na(Duration_Hours)) %>%
  ggplot(aes(x = Duration_Hours, y = Price)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = 'lm', color = "red", se = TRUE) +
  labs(title = "Flight Price by Duration",
       x = "Duration (Hours)",
       y = "Price ($)") +
  theme_minimal()

Both number of stops and duration

flight_data7 %>%
  filter(!is.na(Duration_Hours)) %>%
  filter(!is.na(Total_Stops)) %>%
  ggplot(aes(x = Duration_Hours, y = Price, color = Total_Stops)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = 'lm', se = TRUE) +
  labs(title = "Flight Price by Duration and Number of Stops",
       x = "Duration (Hours)",
       y = "Price ($)",
       color = "Total Stops") +
  theme_minimal() +
  theme(legend.position = "right")

Route Analysis

Well there is a clear impact of route, but not necessarily correlated with number of stops.

flight_data7 %>%
  filter(!is.na(Route)) %>%
  ggplot(aes(x = reorder(Route, Price), y = Price, fill = Total_Stops)) +
  geom_boxplot(alpha = 0.7) +
  labs(title = "Flight Price by Route",
       x = "Route",
       y = "Price ($)",
       fill = "Total Stops") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  coord_flip() 

Bringing multiple variables together

flight_data7 %>%
  ggplot(aes(x = Duration_Hours, y = Price)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = 'lm', color = "black", se = TRUE) +
  facet_grid(rows = vars(Destination),
             cols = vars(Total_Stops)) +
  labs(title = "Flight Price by Duration, Stops, and Destination",
       x = "Duration (Hours)",
       y = "Price ($)") +
  theme_minimal() +
  theme(strip.text = element_text(size = 9))

Regressions like a statistician

Approach 1

As a statistician, I would pick the variables that have the strongest relationships with the outcome, don’t have too much multicollinearity, and a theoretical reason to influence Price.

From below, we can see that the number of stops significantly predicts Price when controlling for other variables, as does duration, departure airport, and route.

regression.flight <- lm(Price ~ Total_Stops + Duration_Hours +
                          airport_code_departure + Route, data = flight_data7)

# Create a tidy summary
regression_summary <- tidy(regression.flight) %>%
  mutate(across(where(is.numeric), ~ round(.x, 4)))

regression_summary %>%
  kbl(caption = "Statistician's Regression Model Results",
      col.names = c("Term", "Estimate", "Std Error", "t-statistic", "p-value")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                full_width = FALSE, position = "left") %>%
  scroll_box(width = "100%", height = "400px")
Statistician's Regression Model Results
Term Estimate Std Error t-statistic p-value
(Intercept) 9701.6109 869.0608 11.1633 0.0000
Total_Stops1 stop 1139.4231 649.9356 1.7531 0.0796
Total_Stops2 stops 7121.5517 617.9348 11.5248 0.0000
Total_Stops3 stops 3959.2440 644.0748 6.1472 0.0000
Duration_Hours 19.1752 4.8461 3.9568 0.0001
airport_code_departureBOM 5674.6212 2686.5043 2.1123 0.0347
airport_code_departureCCU -4647.0958 2548.0820 -1.8238 0.0682
airport_code_departureDEL -3445.8404 852.8178 -4.0405 0.0001
airport_code_departureMAA -4960.3006 878.6765 -5.6452 0.0000
RouteBLR → BBI → DEL 381.3064 1253.5718 0.3042 0.7610
RouteBLR → BDQ → DEL -1793.5685 1105.2300 -1.6228 0.1047
RouteBLR → BOM → AMD → DEL -2553.8200 1516.1433 -1.6844 0.0921
RouteBLR → BOM → BHO → DEL -4658.0159 1515.8505 -3.0729 0.0021
RouteBLR → BOM → DEL 2349.5586 602.8213 3.8976 0.0001
RouteBLR → BOM → IDR → DEL -955.8349 1614.6382 -0.5920 0.5539
RouteBLR → BOM → IDR → GWL → DEL -849.1121 2040.1473 -0.4162 0.6773
RouteBLR → BOM → IXC → DEL -4077.8426 2686.9313 -1.5177 0.1291
RouteBLR → BOM → JDH → DEL -2248.9786 1446.5784 -1.5547 0.1201
RouteBLR → BOM → NAG → DEL -4525.7654 1446.7210 -3.1283 0.0018
RouteBLR → BOM → UDR → DEL -3287.3787 2687.2230 -1.2233 0.2212
RouteBLR → CCU → BBI → DEL -7973.6882 1515.7022 -5.2607 0.0000
RouteBLR → CCU → BBI → HYD → DEL -1497.8441 2040.1211 -0.7342 0.4628
RouteBLR → CCU → DEL 1134.2249 949.0002 1.1952 0.2320
RouteBLR → CCU → GAU → DEL -5349.1850 1297.1851 -4.1237 0.0000
RouteBLR → COK → DEL -1410.5535 924.1852 -1.5263 0.1270
RouteBLR → DEL -4221.5163 871.8087 -4.8423 0.0000
RouteBLR → GAU → DEL 1984.6260 1848.3272 1.0737 0.2830
RouteBLR → GOI → DEL -2238.4595 1105.0790 -2.0256 0.0428
RouteBLR → HBX → BOM → AMD → DEL -3316.3598 2691.1500 -1.2323 0.2179
RouteBLR → HBX → BOM → BHO → DEL 315.2278 2691.2195 0.1171 0.9068
RouteBLR → HBX → BOM → NAG → DEL -1735.8957 2690.1871 -0.6453 0.5188
RouteBLR → HYD → DEL -4497.5952 716.4807 -6.2773 0.0000
RouteBLR → HYD → VGA → DEL -5779.7061 1355.3550 -4.2643 0.0000
RouteBLR → IDR → DEL -3325.2319 1254.5534 -2.6505 0.0080
RouteBLR → LKO → DEL -3597.7831 1547.2066 -2.3253 0.0201
RouteBLR → MAA → DEL -809.4858 766.6789 -1.0558 0.2911
RouteBLR → NAG → DEL 380.8836 1547.2066 0.2462 0.8056
RouteBLR → PNQ → DEL -1949.0880 1371.1702 -1.4215 0.1552
RouteBLR → STV → DEL -5173.3329 1849.0554 -2.7978 0.0052
RouteBLR → TRV → COK → DEL -5511.6829 2035.7607 -2.7074 0.0068
RouteBLR → VGA → DEL -1868.5479 1053.7934 -1.7732 0.0762
RouteBLR → VGA → HYD → DEL -7433.7424 1614.2328 -4.6051 0.0000
RouteBLR → VGA → VTZ → DEL -1601.1831 2035.7837 -0.7865 0.4316
RouteBOM → AMD → ISK → HYD -11009.7871 2863.9963 -3.8442 0.0001
RouteBOM → BBI → HYD -2575.8508 3608.8686 -0.7138 0.4754
RouteBOM → BDQ → DEL → HYD -1654.8968 3506.8814 -0.4719 0.6370
RouteBOM → BHO → DEL → HYD -9907.5890 2651.0803 -3.7372 0.0002
RouteBOM → BLR → CCU → BBI → HYD -5450.9912 3606.3570 -1.5115 0.1307
RouteBOM → BLR → HYD -7860.5518 2987.3867 -2.6312 0.0085
RouteBOM → CCU → HYD -138.8818 3608.9699 -0.0385 0.9693
RouteBOM → COK → MAA → HYD -13136.1754 3506.9912 -3.7457 0.0002
RouteBOM → DEL → HYD -3460.7634 2657.3710 -1.3023 0.1928
RouteBOM → GOI → HYD -8973.6757 3609.4924 -2.4861 0.0129
RouteBOM → GOI → PNQ → HYD -5379.7657 2773.5812 -1.9396 0.0524
RouteBOM → HYD -11467.8695 2555.2488 -4.4880 0.0000
RouteBOM → IDR → DEL → HYD -5963.3195 3037.0389 -1.9635 0.0496
RouteBOM → JAI → DEL → HYD -5182.1958 3507.0511 -1.4777 0.1395
RouteBOM → JDH → DEL → HYD -148.8352 3507.1254 -0.0424 0.9662
RouteBOM → JDH → JAI → DEL → HYD -1609.7437 3605.7618 -0.4464 0.6553
RouteBOM → JLR → HYD -3121.8510 3609.4035 -0.8649 0.3871
RouteBOM → MAA → HYD -8198.5588 2987.5613 -2.7442 0.0061
RouteBOM → NDC → HYD -12437.9748 3609.9132 -3.4455 0.0006
RouteBOM → RPR → VTZ → HYD -12935.9590 3507.5272 -3.6881 0.0002
RouteBOM → UDR → DEL → HYD NA NA NA NA
RouteCCU → AMD → BLR 2043.1406 2589.9789 0.7889 0.4302
RouteCCU → BBI → BLR -266.8938 2504.3817 -0.1066 0.9151
RouteCCU → BBI → BOM → BLR -2123.0923 2650.5472 -0.8010 0.4231
RouteCCU → BBI → HYD → BLR -6814.3357 2750.0976 -2.4779 0.0132
RouteCCU → BBI → IXR → DEL → BLR 2916.3136 3158.5506 0.9233 0.3559
RouteCCU → BLR -585.1953 2564.6079 -0.2282 0.8195
RouteCCU → BOM → AMD → BLR 1864.8727 2749.8279 0.6782 0.4977
RouteCCU → BOM → BLR 5001.8441 2481.4049 2.0157 0.0439
RouteCCU → BOM → COK → BLR 234.0469 2661.9103 0.0879 0.9299
RouteCCU → BOM → GOI → BLR -692.7354 2694.6428 -0.2571 0.7971
RouteCCU → BOM → HBX → BLR -2509.8879 2811.1908 -0.8928 0.3720
RouteCCU → BOM → PNQ → BLR -2434.3810 2900.9232 -0.8392 0.4014
RouteCCU → BOM → TRV → BLR -254.0459 3155.3647 -0.0805 0.9358
RouteCCU → DEL → AMD → BLR 1151.5338 2677.9452 0.4300 0.6672
RouteCCU → DEL → BLR 4218.6197 2482.6556 1.6992 0.0893
RouteCCU → DEL → COK → BLR 985.3226 2654.9283 0.3711 0.7105
RouteCCU → DEL → COK → TRV → BLR 3443.3161 2904.0334 1.1857 0.2358
RouteCCU → DEL → VGA → BLR -146.1670 2901.7377 -0.0504 0.9598
RouteCCU → GAU → BLR 1189.2654 2509.9472 0.4738 0.6356
RouteCCU → GAU → DEL → BLR 1300.5552 2652.6339 0.4903 0.6239
RouteCCU → GAU → IMF → DEL → BLR 4185.8371 2850.3849 1.4685 0.1420
RouteCCU → HYD → BLR -1442.7784 2503.0168 -0.5764 0.5643
RouteCCU → IXA → BLR 3110.4021 3506.8233 0.8870 0.3751
RouteCCU → IXB → BLR 2961.1469 2540.9370 1.1654 0.2439
RouteCCU → IXB → DEL → BLR 2476.7584 3610.9686 0.6859 0.4928
RouteCCU → IXR → BBI → BLR -4950.4482 2810.6291 -1.7613 0.0782
RouteCCU → IXR → DEL → BLR -1583.6284 2647.9823 -0.5981 0.5498
RouteCCU → IXZ → MAA → BLR -1285.2519 3610.8998 -0.3559 0.7219
RouteCCU → JAI → BOM → BLR -1663.9833 2711.2240 -0.6137 0.5394
RouteCCU → JAI → DEL → BLR -372.8776 2810.7453 -0.1327 0.8945
RouteCCU → KNU → BLR 2290.4413 2600.7423 0.8807 0.3785
RouteCCU → MAA → BLR -1123.7456 2498.7520 -0.4497 0.6529
RouteCCU → NAG → BLR -18.4708 2613.8318 -0.0071 0.9944
RouteCCU → PAT → BLR 4533.0835 2600.7317 1.7430 0.0814
RouteCCU → PNQ → BLR 381.2274 2544.1226 0.1498 0.8809
RouteCCU → RPR → HYD → BLR -6180.1286 3608.8546 -1.7125 0.0868
RouteCCU → VNS → DEL → BLR 2670.2839 2737.0685 0.9756 0.3293
RouteCCU → VTZ → BLR NA NA NA NA
RouteDEL → AMD → BOM → COK -1113.9883 609.5917 -1.8274 0.0677
RouteDEL → AMD → COK -1908.9667 798.0112 -2.3922 0.0168
RouteDEL → ATQ → BOM → COK 1306.1964 711.0793 1.8369 0.0663
RouteDEL → BBI → COK 3553.9529 1273.1521 2.7915 0.0053
RouteDEL → BDQ → BOM → COK -1419.0583 749.5697 -1.8932 0.0584
RouteDEL → BHO → BOM → COK 408.4667 682.9797 0.5981 0.5498
RouteDEL → BLR → COK -110.8238 644.1547 -0.1720 0.8634
RouteDEL → BOM → COK 3263.1570 624.1415 5.2282 0.0000
RouteDEL → CCU → BOM → COK 94.8335 693.7997 0.1367 0.8913
RouteDEL → COK NA NA NA NA
RouteDEL → DED → BOM → COK 5701.1730 1848.2657 3.0846 0.0020
RouteDEL → GOI → BOM → COK -2189.5779 669.7220 -3.2694 0.0011
RouteDEL → GWL → IDR → BOM → COK 6864.8105 1069.5949 6.4181 0.0000
RouteDEL → HYD → BOM → COK -2122.7315 671.9196 -3.1592 0.0016
RouteDEL → HYD → COK -279.3114 638.2682 -0.4376 0.6617
RouteDEL → HYD → MAA → COK -4593.4733 645.6063 -7.1150 0.0000
RouteDEL → IDR → BOM → COK -282.8132 636.6398 -0.4442 0.6569
RouteDEL → IXC → BOM → COK -1571.9349 902.6048 -1.7416 0.0816
RouteDEL → IXU → BOM → COK 4922.1676 981.6628 5.0141 0.0000
RouteDEL → JAI → BOM → COK -1472.0712 606.8210 -2.4259 0.0153
RouteDEL → JDH → BOM → COK 2474.6039 713.9009 3.4663 0.0005
RouteDEL → LKO → BOM → COK -522.1773 701.2905 -0.7446 0.4565
RouteDEL → LKO → COK -336.6429 867.8025 -0.3879 0.6981
RouteDEL → MAA → BOM → COK -4728.6995 852.0184 -5.5500 0.0000
RouteDEL → MAA → COK -442.2079 655.1116 -0.6750 0.4997
RouteDEL → NAG → BOM → COK -1471.1202 654.6183 -2.2473 0.0246
RouteDEL → PNQ → COK -882.6067 733.8771 -1.2027 0.2291
RouteDEL → RPR → NAG → BOM → COK NA NA NA NA
RouteDEL → TRV → COK NA NA NA NA
RouteDEL → UDR → BOM → COK NA NA NA NA
RouteMAA → CCU NA NA NA NA
# Model summary statistics
model_stats <- glance(regression.flight) %>%
  select(r.squared, adj.r.squared, sigma, statistic, p.value, df, nobs) %>%
  mutate(across(where(is.numeric), ~ round(.x, 4)))

model_stats %>%
  kbl(caption = "Model Summary Statistics") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                full_width = FALSE, position = "left")
Model Summary Statistics
r.squared adj.r.squared sigma statistic p.value df nobs
0.6332 0.6288 2479.698 143.2589 0 125 10499

Regressions like a Data Scientist

If I free myself from needing to rely on theory to make predictions, I would go with an All Subsets regression approach, which essentially tests every combination of possible variables, calculates the variance predicted and various model fit statistics (AIC, BIC, etc.) and selects the model with the highest predicted variances and lowest model fit statistics.

While I’ve done this analysis before in SPSS, I had not yet done it in R, so I had Claude help me write the code.

Preparing Data for All Subsets Regression

flight_data8 <- flight_data7 %>%
  mutate(number_of_stops = case_when(Total_Stops == "non-stop" ~ 0,
                                     Total_Stops == "1 stop" ~ 1,
                                     Total_Stops == "2 stops" ~ 2,
                                     Total_Stops == "3 stops" ~ 3,
                                     Total_Stops == "4 stops" ~ 4),
         weekend = case_when(departure_day_of_week == "Sunday" | departure_day_of_week == "Saturday" ~ "Weekend",
                             TRUE ~ "Weekday"))

Adding random variables for comparison

Now I’m going to add a random categorical and a random continuous feature to my variable

flight_data8$random_categorical <- sample(c('red', 'orange', 'yellow', 'green', 'blue'), nrow(flight_data8), replace=TRUE)
flight_data8$random_continuous <- sample(1:9, nrow(flight_data8), replace = TRUE)/10

Variable Selection via Standardized Regression

First, I identify the variables in the model.

I’m going to do that by running a regular standardized linear regression. Whichever variables have higher standardized values than the random variables will not be included.

options(scipen = 999)

regression.flight <- lm(Price ~ 
                        # Continuous
                          scale(Departure_Day) + scale(Departure_Month) + scale(Departure_Hour) +
                          scale(number_of_stops) + scale(Duration_Hours) +
                          scale(random_continuous) +
                        # Dichotomous
                          weekend +
                        # Categorical
                          Airline + random_categorical +
                          airport_code_departure + airport_code_first_stop + airport_code_second_stop + 
                          airport_code_third_stop + airport_code_arrival +
                        # Dates 
                          departure_date_time + arrival_date_time, 
                        data = flight_data8)

# Look at those regression coefficients
regression_coefficients <- data.frame(regression.flight$coefficients)

regression_coefficients %>% 
  mutate(abs_coef = abs(regression.flight.coefficients)) %>%
  filter(!is.na(abs_coef)) %>%
  arrange(abs_coef) %>%
  head(20) %>%
  kbl(caption = "Smallest Regression Coefficients (Bottom 20)",
      col.names = c("Coefficient", "Absolute Value")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                full_width = FALSE, position = "left")
Smallest Regression Coefficients (Bottom 20)
Coefficient Absolute Value
departure_date_time -0.0034778 0.0034778
scale(random_continuous) 65.6289989 65.6289989
random_categoricalyellow -181.0625958 181.0625958
random_categoricalred 338.8869605 338.8869605
random_categoricalorange 345.9045625 345.9045625
weekendWeekend 682.9016169 682.9016169
random_categoricalgreen 1007.3186698 1007.3186698
scale(Departure_Hour) -1379.4968917 1379.4968917
scale(Duration_Hours) -1478.3129209 1478.3129209
airport_code_first_stopDEL -1577.2935732 1577.2935732
scale(Departure_Day) 2208.3456998 2208.3456998
airport_code_first_stopGAU -2494.4066276 2494.4066276
airport_code_first_stopBLR -2821.2751534 2821.2751534
AirlineMultiple carriers 3406.4624653 3406.4624653
airport_code_third_stopNAG 3496.8071739 3496.8071739
airport_code_third_stopBHO 3520.7828031 3520.7828031
airport_code_first_stopCCU 3899.3224172 3899.3224172
airport_code_departureDEL 5803.5836240 5803.5836240
airport_code_first_stopBOM 7319.1422314 7319.1422314
airport_code_departureCCU 7867.2382976 7867.2382976

For my continuous variables, departure_date_time did a worse job predicting than the random continuous variable. For my categorical variables, nothing did worse.

Streamlined All Subsets Regression

predictor_variables <- c(# Continuous
                          "Departure_Day", "Departure_Month", "Departure_Hour",
                          "number_of_stops", "Duration_Hours",
                        # Dichotomous
                          "weekend",
                        # Lower-cardinality Categorical (keeping only essential ones)
                          "Airline", "airport_code_departure", "airport_code_first_stop", 
                          "airport_code_second_stop", "airport_code_third_stop", "airport_code_arrival")

outcome_variable <- "Price"

cat("=== STREAMLINED ALL SUBSETS REGRESSION ===\n")
=== STREAMLINED ALL SUBSETS REGRESSION ===
cat("Testing", length(predictor_variables), "predictor variables\n")
Testing 12 predictor variables
cat("This will test up to", 2^length(predictor_variables) - 1, "different model combinations\n")
This will test up to 4095 different model combinations
cat("Variables included:", paste(predictor_variables, collapse = ", "), "\n\n")
Variables included: Departure_Day, Departure_Month, Departure_Hour, number_of_stops, Duration_Hours, weekend, Airline, airport_code_departure, airport_code_first_stop, airport_code_second_stop, airport_code_third_stop, airport_code_arrival 

Data exploration function to check cardinality

check_variable_cardinality <- function(data = flight_data8, vars = predictor_variables) {
  if (!exists("flight_data8")) {
    cat("flight_data8 not found\n")
    return(NULL)
  }
  
  cardinality_results <- data.frame(
    Variable = character(),
    Unique_Values = numeric(),
    Warning = character(),
    stringsAsFactors = FALSE
  )
  
  for (var in vars) {
    if (var %in% names(data)) {
      unique_vals <- length(unique(data[[var]], na.rm = TRUE))
      warning_msg <- ifelse(unique_vals > 50, "High cardinality - may cause issues", "")
      
      cardinality_results <- rbind(cardinality_results, 
                                 data.frame(Variable = var, 
                                          Unique_Values = unique_vals, 
                                          Warning = warning_msg))
    } else {
      cardinality_results <- rbind(cardinality_results, 
                                 data.frame(Variable = var, 
                                          Unique_Values = NA, 
                                          Warning = "NOT FOUND in dataset"))
    }
  }
  
  return(cardinality_results)
}

cardinality_check <- check_variable_cardinality()
cardinality_check %>%
  kbl(caption = "Variable Cardinality Check") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                full_width = FALSE, position = "left")
Variable Cardinality Check
Variable Unique_Values Warning
Departure_Day 10
Departure_Month 4
Departure_Hour 24
number_of_stops 4
Duration_Hours 366 High cardinality - may cause issues
weekend 2
Airline 11
airport_code_departure 5
airport_code_first_stop 41
airport_code_second_stop 26
airport_code_third_stop 10
airport_code_arrival 5

Method 1: Using the leaps package (most efficient)

flight_all_subsets_leaps <- function(data = flight_data8, outcome_var = "Price", 
                                     predictor_vars = predictor_variables, method = "exhaustive") {
  
  # Check if data exists
  if (!exists("flight_data8")) {
    stop("flight_data8 dataset not found. Please load your data first.")
  }
  
  # Remove rows with missing values in key variables
  analysis_data <- data %>%
    select(all_of(c(outcome_var, predictor_vars))) %>%
    na.omit()
  
  cat("Analysis dataset created with", nrow(analysis_data), "complete observations\n")
  cat("Original dataset had", nrow(data), "observations\n\n")
  
  # Run all subsets regression with really.big=TRUE for safety
  cat("Running all subsets regression...\n")
  regsubsets_result <- regsubsets(formula(paste(outcome_var, "~", paste(predictor_vars, collapse = " + "))), 
                                  data = analysis_data, 
                                  nbest = 1,        # Keep best model of each size
                                  nvmax = length(predictor_vars),  # Max variables
                                  method = method,   # "exhaustive", "forward", "backward"
                                  really.big = TRUE) # Handle large searches
  
  # Extract results
  summary_results <- summary(regsubsets_result)
  
  # Create results data frame
  results_df <- data.frame(
    n_variables = 1:length(summary_results$rsq),
    r_squared = summary_results$rsq,
    adj_r_squared = summary_results$adjr2,
    cp = summary_results$cp,
    bic = summary_results$bic,
    variables_included = apply(summary_results$which[,-1], 1, function(x) {
      paste(names(x)[x], collapse = ", ")
    })
  )
  
  # Find best models by different criteria
  best_rsq_idx <- which.max(results_df$r_squared)
  best_adj_rsq_idx <- which.max(results_df$adj_r_squared)
  best_cp_idx <- which.min(results_df$cp)
  best_bic_idx <- which.min(results_df$bic)
  
  # Add indicator columns
  results_df$best_rsq <- 1:nrow(results_df) == best_rsq_idx
  results_df$best_adj_rsq <- 1:nrow(results_df) == best_adj_rsq_idx
  results_df$best_cp <- 1:nrow(results_df) == best_cp_idx
  results_df$best_bic <- 1:nrow(results_df) == best_bic_idx
  
  return(list(
    results = results_df,
    regsubsets_object = regsubsets_result,
    analysis_data = analysis_data,
    best_models = list(
      highest_rsq = results_df[best_rsq_idx, ],
      best_adj_rsq = results_df[best_adj_rsq_idx, ],
      best_cp = results_df[best_cp_idx, ],
      best_bic = results_df[best_bic_idx, ]
    )
  ))
}

Alternative: Forward/Backward Selection (faster and handles dependencies better)

flight_stepwise_selection <- function(data = flight_data8, outcome_var = "Price", 
                                      predictor_vars = predictor_variables, method = "both") {
  
  # Check if data exists
  if (!exists("flight_data8")) {
    stop("flight_data8 dataset not found. Please load your data first.")
  }
  
  # Remove rows with missing values in key variables
  analysis_data <- data %>%
    select(all_of(c(outcome_var, predictor_vars))) %>%
    na.omit()
  
  cat("Stepwise selection dataset created with", nrow(analysis_data), "complete observations\n")
  cat("Original dataset had", nrow(data), "observations\n\n")
  
  # Create full model formula
  full_formula <- formula(paste(outcome_var, "~", paste(predictor_vars, collapse = " + ")))
  
  # Fit full model
  full_model <- lm(full_formula, data = analysis_data)
  
  # Perform stepwise selection
  cat("Performing stepwise selection (method =", method, ")...\n")
  if (method == "forward") {
    # Start with null model for forward selection
    null_model <- lm(formula(paste(outcome_var, "~ 1")), data = analysis_data)
    step_model <- step(null_model, scope = list(lower = null_model, upper = full_model), 
                       direction = "forward", trace = FALSE)
  } else if (method == "backward") {
    # Start with full model for backward selection
    step_model <- step(full_model, direction = "backward", trace = FALSE)
  } else {
    # Both directions
    step_model <- step(full_model, direction = "both", trace = FALSE)
  }
  
  # Get model summary
  step_summary <- summary(step_model)
  
  cat("\n=== STEPWISE SELECTION RESULTS ===\n")
  cat("Final model R-squared:", round(step_summary$r.squared, 4), "\n")
  cat("Final model Adjusted R-squared:", round(step_summary$adj.r.squared, 4), "\n")
  cat("Variables selected:", length(step_model$coefficients) - 1, "out of", length(predictor_vars), "\n")
  cat("Variables in final model:", paste(names(step_model$coefficients)[-1], collapse = ", "), "\n\n")
  
  return(list(
    model = step_model,
    summary = step_summary,
    aic = AIC(step_model),
    bic = BIC(step_model),
    formula = formula(step_model),
    variables_selected = names(step_model$coefficients)[-1]
  ))
}

Visualization function

plot_flight_model_comparison <- function(results_df, top_n = 15) {
  
  # Get top models
  top_models <- head(results_df, top_n)
  
  # Create the plot
  p <- top_models %>%
    mutate(variables_wrapped = str_wrap(variables_included, width = 50)) %>%
    mutate(variables_wrapped = fct_reorder(variables_wrapped, r_squared)) %>%
    ggplot(aes(x = variables_wrapped, y = r_squared)) +
    geom_bar(stat = 'identity', fill = 'steelblue', alpha = 0.7) +
    geom_text(aes(label = paste0(round(r_squared, 3))), 
              hjust = -.1, size = 3.5, color = "black", fontface = 'bold') +
    coord_flip() +
    theme_minimal() +
    theme(plot.title = element_text(size = 14, hjust = 0.5),
          plot.subtitle = element_text(hjust = 0.5),
          plot.caption = element_text(size = 9, color = "#666666"),
          axis.text.x = element_text(size = 10),
          axis.text.y = element_text(size = 8),
          axis.title.y = element_text(size = 11),
          axis.title.x = element_text(size = 11),
          panel.grid.minor = element_blank()) +
    labs(title = paste("Top", top_n, "Flight Price Prediction Models by R-squared"),
         x = "Variable Combinations", 
         y = "R-squared",
         caption = paste("Total combinations tested:", nrow(results_df)))
  
  return(p)
}

Helper function to get detailed results for best flight price model

get_best_flight_model_details <- function(data = flight_data8, outcome_var = "Price", model_variables_string) {
  
  # Parse the variable string to extract original variable names
  original_vars <- c()
  
  # Check for each of our streamlined predictor variables
  if (grepl("Airline", model_variables_string)) {
    original_vars <- c(original_vars, "Airline")
  }
  if (grepl("Departure_Day", model_variables_string)) {
    original_vars <- c(original_vars, "Departure_Day")
  }
  if (grepl("Departure_Month", model_variables_string)) {
    original_vars <- c(original_vars, "Departure_Month")
  }
  if (grepl("Departure_Hour", model_variables_string)) {
    original_vars <- c(original_vars, "Departure_Hour")
  }
  if (grepl("weekend", model_variables_string)) {
    original_vars <- c(original_vars, "weekend")
  }
  if (grepl("airport_code_departure", model_variables_string)) {
    original_vars <- c(original_vars, "airport_code_departure")
  }
  if (grepl("airport_code_first_stop", model_variables_string)) {
    original_vars <- c(original_vars, "airport_code_first_stop")
  }
  if (grepl("airport_code_second_stop", model_variables_string)) {
    original_vars <- c(original_vars, "airport_code_second_stop")
  }
  if (grepl("airport_code_third_stop", model_variables_string)) {
    original_vars <- c(original_vars, "airport_code_third_stop")
  }
  if (grepl("airport_code_arrival", model_variables_string)) {
    original_vars <- c(original_vars, "airport_code_arrival")
  }
  if (grepl("number_of_stops", model_variables_string)) {
    original_vars <- c(original_vars, "number_of_stops")
  }
  if (grepl("Duration_Hours", model_variables_string)) {
    original_vars <- c(original_vars, "Duration_Hours")
  }
  
  # Remove rows with missing values using original variable names
  analysis_data <- data %>%
    select(all_of(c(outcome_var, original_vars))) %>%
    na.omit()
  
  # Create formula with original variables
  formula_str <- paste(outcome_var, "~", paste(original_vars, collapse = " + "))
  
  # Fit the model
  best_model <- lm(as.formula(formula_str), data = analysis_data)
  
  # Get detailed summary
  model_summary <- summary(best_model)
  
  cat("=== BEST STREAMLINED FLIGHT PRICE PREDICTION MODEL ===\n")
  cat("Original Variables Used (", length(original_vars), "total):", paste(original_vars, collapse = ", "), "\n")
  cat("R-squared:", round(model_summary$r.squared, 4), "\n")
  cat("Adjusted R-squared:", round(model_summary$adj.r.squared, 4), "\n")
  cat("F-statistic:", round(model_summary$fstatistic[1], 4), "\n")
  cat("P-value:", format.pval(pf(model_summary$fstatistic[1], 
                                 model_summary$fstatistic[2], 
                                 model_summary$fstatistic[3], 
                                 lower.tail = FALSE)), "\n")
  cat("Number of observations:", nrow(analysis_data), "\n\n")
  
  return(best_model)
}

Final Results

Run the analysis on your flight data

if (exists("flight_data8")) {
  
  cat("=== RUNNING STREAMLINED ALL SUBSETS REGRESSION ON FLIGHT PRICE DATA ===\n\n")
  
  # First, check the cardinality of variables to identify potential issues
  check_variable_cardinality()
  
  # Method 1: Try exhaustive search with streamlined variables
  cat("Attempting exhaustive search with streamlined variables...\n")
  tryCatch({
    leaps_results <- flight_all_subsets_leaps()
    
    # Get detailed results for the model with highest R-squared
    best_model_rsq <- get_best_flight_model_details(model_variables_string = leaps_results$best_models$highest_rsq$variables_included)
    
    # Get detailed results for the model with best BIC (often better for prediction)
    best_model_bic <- get_best_flight_model_details(model_variables_string = leaps_results$best_models$best_bic$variables_included)
    
  }, error = function(e) {
    cat("Exhaustive search failed:", e$message, "\n")
    cat("Switching to stepwise selection method...\n\n")
    
    # Method 2: Stepwise selection (more robust for categorical variables)
    cat("=== RUNNING STEPWISE SELECTION (ALTERNATIVE METHOD) ===\n")
    
    # Forward selection
    cat("1. Forward Selection:\n")
    forward_results <- flight_stepwise_selection(method = "forward")
    
    # Backward selection  
    cat("\n2. Backward Selection:\n")
    backward_results <- flight_stepwise_selection(method = "backward")
    
    # Both directions
    cat("\n3. Bidirectional Selection:\n") 
    both_results <- flight_stepwise_selection(method = "both")
    
    # Compare the three methods
    cat("\n=== COMPARISON OF STEPWISE METHODS ===\n")
    comparison_df <- data.frame(
      Method = c("Forward", "Backward", "Both"),
      R_squared = c(summary(forward_results$model)$r.squared,
                    summary(backward_results$model)$r.squared, 
                    summary(both_results$model)$r.squared),
      Adj_R_squared = c(summary(forward_results$model)$adj.r.squared,
                        summary(backward_results$model)$adj.r.squared,
                        summary(both_results$model)$adj.r.squared),
      AIC = c(forward_results$aic, backward_results$aic, both_results$aic),
      BIC = c(forward_results$bic, backward_results$bic, both_results$bic),
      N_Variables = c(length(forward_results$variables_selected),
                      length(backward_results$variables_selected),
                      length(both_results$variables_selected))
    )
    print(comparison_df)
    
    # Recommend best model
    best_method_idx <- which.max(comparison_df$Adj_R_squared)
    cat("\nRECOMMENDED MODEL: ", comparison_df$Method[best_method_idx], 
        " (Highest Adjusted R-squared: ", round(comparison_df$Adj_R_squared[best_method_idx], 4), ")\n")
  })
  
} else {
  cat("ERROR: flight_data8 dataset not found!\n")
}

Results Summary - All Models Ranked by Number of Variables:

# Display results summary in a nice table
if (exists("leaps_results")) {
  leaps_results$results %>%
    select(n_variables, r_squared, adj_r_squared, cp, bic) %>%
    mutate(across(where(is.numeric), ~ round(.x, 4))) %>%
    kbl(caption = "All Subsets Regression Results Summary",
        col.names = c("# Variables", "R-squared", "Adj R-squared", "Cp", "BIC")) %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                  full_width = FALSE, position = "left")
}
All Subsets Regression Results Summary
# Variables R-squared Adj R-squared Cp BIC
1 0.4723 0.4601 -54.4857 -21.1550
2 0.5813 0.5614 -49.7011 -27.7562
3 0.6597 0.6348 -45.6978 -33.2769
4 0.7190 0.6909 -42.1812 -38.0927
5 0.7655 0.7354 -38.9934 -42.4220
6 0.7802 0.7455 -36.6181 -41.5250
7 0.7939 0.7548 -34.2687 -40.6085
8 0.8072 0.7644 -31.9263 -39.8255
9 0.8152 0.7677 -29.7233 -37.9125
10 0.8216 0.7691 -27.5596 -35.6924
11 0.8320 0.7761 -25.2925 -34.6022
12 0.8377 0.7768 -23.1491 -32.3247
13 0.8402 0.7732 -21.0841 -29.2293

Best Models by Different Criteria:

if (exists("leaps_results")) {
  # Create a summary of best models
  best_models_summary <- data.frame(
    Criterion = c("Highest R-squared", "Best Adjusted R-squared", "Best BIC"),
    Variables = c(leaps_results$best_models$highest_rsq$n_variables,
                  leaps_results$best_models$best_adj_rsq$n_variables,
                  leaps_results$best_models$best_bic$n_variables),
    R_squared = c(leaps_results$best_models$highest_rsq$r_squared,
                  leaps_results$best_models$best_adj_rsq$r_squared,
                  leaps_results$best_models$best_bic$r_squared),
    Adj_R_squared = c(leaps_results$best_models$highest_rsq$adj_r_squared,
                      leaps_results$best_models$best_adj_rsq$adj_r_squared,
                      leaps_results$best_models$best_bic$adj_r_squared),
    BIC = c(leaps_results$best_models$highest_rsq$bic,
            leaps_results$best_models$best_adj_rsq$bic,
            leaps_results$best_models$best_bic$bic)
  )
  
  best_models_summary %>%
    mutate(across(where(is.numeric), ~ round(.x, 4))) %>%
    kbl(caption = "Best Models by Different Criteria") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                  full_width = FALSE, position = "left")
}
Best Models by Different Criteria
Criterion Variables R_squared Adj_R_squared BIC
Highest R-squared 13 0.8402 0.7732 -29.2293
Best Adjusted R-squared 12 0.8377 0.7768 -32.3247
Best BIC 5 0.7655 0.7354 -42.4220

Visualization of Top Models:

if (exists("leaps_results")) {
  plot_comparison <- plot_flight_model_comparison(leaps_results$results, top_n = 10)
  print(plot_comparison)
}

Final Model Details:

if (exists("best_model_rsq")) {
  # Create a clean coefficient table
  final_model_tidy <- tidy(best_model_rsq) %>%
    mutate(
      estimate = round(estimate, 2),
      std.error = round(std.error, 2),
      statistic = round(statistic, 2),
      p.value = case_when(
        p.value < 0.001 ~ "< 0.001",
        p.value < 0.01 ~ sprintf("%.3f", p.value),
        TRUE ~ sprintf("%.3f", p.value)
      ),
      significance = case_when(
        as.numeric(ifelse(p.value == "< 0.001", 0, p.value)) < 0.001 ~ "***",
        as.numeric(ifelse(p.value == "< 0.001", 0, p.value)) < 0.01 ~ "**",
        as.numeric(ifelse(p.value == "< 0.001", 0, p.value)) < 0.05 ~ "*",
        as.numeric(ifelse(p.value == "< 0.001", 0, p.value)) < 0.1 ~ ".",
        TRUE ~ ""
      )
    )

  final_model_tidy %>%
    kbl(caption = "Final Model Coefficients",
        col.names = c("Term", "Estimate", "Std Error", "t-statistic", "p-value", "Sig.")) %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                  full_width = FALSE, position = "left") %>%
    scroll_box(width = "100%", height = "400px")

  # Model summary statistics
  model_summary_stats <- glance(best_model_rsq) %>%
    select(r.squared, adj.r.squared, sigma, statistic, p.value, df, nobs) %>%
    mutate(across(where(is.numeric), ~ round(.x, 4))) %>%
    pivot_longer(everything(), names_to = "Statistic", values_to = "Value")

  model_summary_stats %>%
    kbl(caption = "Final Model Summary Statistics") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                  full_width = FALSE, position = "left")
}
Final Model Summary Statistics
Statistic Value
r.squared 0.8435
adj.r.squared 0.7541
sigma 1469.5468
statistic 9.4314
p.value 0.0000
df 16.0000
nobs 45.0000

Key Findings

Enhanced Model Performance: - The enhanced model with route parsing and airport codes provides more granular insights into flight pricing - Route-specific variables (airport codes for stops) show significant predictive power - The comprehensive approach combines traditional statistical methods with data science techniques

Most Important Predictors:

  1. Number of stops - Remains the strongest predictor

  2. Airport codes - Specific departure and arrival airports significantly impact pricing

  3. Duration of flight - Flight length is a key cost driver

  4. Airline - Different carriers have distinct pricing strategies

  5. Route complexity - The specific path taken affects pricing beyond just number of stops

Methodological Insights: - Random variable comparison helped identify which predictors truly add value - Standardized regression coefficients revealed the relative importance of continuous vs categorical variables - All subsets regression vs stepwise selection provided different perspectives on optimal model selection

Business Insights: - Specific airport combinations create premium pricing opportunities - Route complexity beyond simple stop count influences pricing - Temporal factors (departure time, day, month) remain important but secondary to route characteristics - The enhanced model provides actionable insights for revenue optimization and pricing strategy